/* (c) 2014 LinkedIn Corp. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not use
* this file except in compliance with the License. You may obtain a copy of the
* License at http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software distributed
* under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
* CONDITIONS OF ANY KIND, either express or implied.
*/
package com.linkedin.cubert.utils;
import com.linkedin.cubert.block.BlockSchema;
import com.linkedin.cubert.block.ColumnType;
import com.linkedin.cubert.block.DataType;
import com.linkedin.cubert.pig.piggybank.storage.avro.AvroSchema2Pig;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import org.apache.avro.Schema;
import org.apache.avro.Schema.Field;
import org.apache.avro.Schema.Type;
import org.apache.avro.file.DataFileReader;
import org.apache.avro.file.DataFileWriter;
import org.apache.avro.file.SeekableInput;
import org.apache.avro.generic.GenericDatumReader;
import org.apache.avro.generic.GenericDatumWriter;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.io.DatumReader;
import org.apache.avro.mapred.FsInput;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.permission.FsAction;
import org.apache.hadoop.fs.permission.FsPermission;
import org.apache.hadoop.mapred.JobConf;
import org.codehaus.jackson.JsonNode;
import org.codehaus.jackson.node.NullNode;
/**
 * Various utility methods for working with Avro schemas.
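 * <p>
 * Includes helpers to read the schema of an Avro file on a Hadoop file system,
 * convert between Cubert's BlockSchema and Avro's Schema in both directions, and
 * create an empty Avro file with a given schema.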
*
* @author Maneesh Varshney
*
*/
public class AvroUtils
{
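    // appended to generated record names so that nested array-element records get
    // unique names within a schema; reset on every top-level conversion (not thread-safe)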
private static int arrayElemInSchemaCounter = 0;
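    // when true, fields whose type is a union with NULL as the first branch are given
    // an explicit default value of null in the schema returned by getSchema()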
    private static final boolean PAD_DEFAULT_NULLS_TO_SCHEMA = true;
/**
* Extracts the schema of an Avro file.
*
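 * <p>
 * A minimal usage sketch (the path below is hypothetical):
 *
 * <pre>{@code
 * Configuration conf = new Configuration();
 * Schema schema = AvroUtils.getSchema(conf, new Path("/data/tracking/events"));
 * System.out.println(schema.toString(true));
 * }</pre>
 *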
 * @param conf Hadoop configuration used to access the file system
 * @param path a path to an Avro file, or to a directory that is searched
 *            recursively for the first *.avro file
 * @return the schema of the matching Avro file
 * @throws IOException if no *.avro file is found under the path, or if the file
 *             cannot be read
*/
public static Schema getSchema(Configuration conf, Path path) throws IOException
{
FileSystem fs = path.getFileSystem(conf);
Path anAvroFile = FileSystemUtils.getFirstMatch(fs, path, "*.avro", true);
        if (anAvroFile == null)
            throw new IOException("There are no *.avro files in " + path.toString());
System.out.println("Obtaining schema of avro file " + anAvroFile.toString());
return getSchema(new FsInput(anAvroFile, conf));
}
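    /**
     * Reads the schema from the header of an Avro container file.
     * <p>
     * If {@code PAD_DEFAULT_NULLS_TO_SCHEMA} is set, the returned schema is a copy in
     * which every field typed as a [null, ...] union is given an explicit default
     * value of null.
     *
     * @param input a seekable stream positioned at the start of an Avro container file
     * @return the (possibly null-padded) schema of the file
     * @throws IOException if the input cannot be read as an Avro container file
     */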
public static Schema getSchema(SeekableInput input) throws IOException
{
        DatumReader<GenericRecord> datumReader = new GenericDatumReader<GenericRecord>();
        DataFileReader<GenericRecord> dataFileReader =
                new DataFileReader<GenericRecord>(input, datumReader);
        Schema schema = dataFileReader.getSchema();
        // the reader is needed only for the file header; close it so that the
        // underlying input is not leaked
        dataFileReader.close();
        if (PAD_DEFAULT_NULLS_TO_SCHEMA)
{
// a list of "cloned" fields, with optional default value set to null
ArrayList<Field> paddedFields = new ArrayList<Field>();
for (Field field: schema.getFields())
{
// should this field be padded?
boolean needsNullPadding = (field.schema() != null) // the field has nested schema
&& (field.schema().getType().equals(Type.UNION)) // the nested schema is UNION
&& (field.schema().getTypes().get(0).getType().equals(Type.NULL)); // the first element of union is NULL type
JsonNode defValue = needsNullPadding ? NullNode.getInstance() : field.defaultValue();
Field f = new Field(field.name(), field.schema(), field.doc(), defValue);
paddedFields.add(f);
}
schema = Schema.createRecord(schema.getName(), schema.getDoc(), schema.getNamespace(), schema.isError());
schema.setFields(paddedFields);
}
return schema;
}
/**
 * Converts a Cubert BlockSchema to an Avro Schema.
*
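 * <p>
 * A minimal sketch, assuming BlockSchema's string-based constructor (a
 * comma-separated list of "TYPE columnName" pairs):
 *
 * <pre>{@code
 * BlockSchema blockSchema = new BlockSchema("INT memberId, STRING memberName");
 * Schema avroSchema = AvroUtils.convertFromBlockSchema("MY_RECORD", blockSchema);
 * }</pre>
 *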
 * @param recordName the name given to the top-level Avro record
 * @param schema the BlockSchema to convert
 * @return the equivalent Avro record schema
*/
public static Schema convertFromBlockSchema(String recordName, BlockSchema schema)
{
arrayElemInSchemaCounter = 0;
return convertFromBlockSchema(recordName, Type.RECORD, schema, true);
}
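    /**
     * Builds one Avro Field per BlockSchema column. Columns that carry a nested
     * schema, as well as ARRAY and MAP columns, are converted recursively; primitive
     * columns are wrapped in a [null, type] union so that they are nullable.
     */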
    private static Field[] createFields(BlockSchema schema)
    {
Field[] fields = new Field[schema.getNumColumns()];
for (int idx = 0; idx < fields.length; idx++)
{
final ColumnType col = schema.getColumnType(idx);
final DataType colType = col.getType();
final Type subType = convertToAvroType(colType);
final Schema colSchema;
if (col.getColumnSchema() != null ||
subType == Type.ARRAY || subType == Type.MAP)
{
colSchema =
convertFromBlockSchema(col.getName(),
subType,
col.getColumnSchema(), false);
}
else
{
List<Schema> unionSchema = new ArrayList<Schema>();
unionSchema.add(Schema.create(Type.NULL));
unionSchema.add(Schema.create(subType));
colSchema = Schema.createUnion(unionSchema);
}
fields[idx] = new Field(col.getName(), colSchema, null, null);
}
return fields;
}
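    /**
     * Recursively converts a BlockSchema subtree to an Avro schema of the given
     * composite type (RECORD, ARRAY or MAP). Every generated schema except the
     * top-level record is wrapped in a [null, schema] union to make it nullable.
     */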
private static Schema convertFromBlockSchema(final String name,
final Type type,
final BlockSchema schema,
boolean toplevel)
{
        Schema avroSchema;
        List<Schema> unionSchema;
switch (type)
{
case RECORD:
Field[] fields = createFields(schema);
avroSchema = Schema.createRecord(name, null, null, false);
avroSchema.setFields(Arrays.asList(fields));
if (toplevel)
break;
            unionSchema = new ArrayList<Schema>();
unionSchema.add(Schema.create(Type.NULL));
unionSchema.add(avroSchema);
avroSchema = Schema.createUnion(unionSchema);
break;
case ARRAY:
{
if (schema.getNumColumns() != 1)
{
throw new RuntimeException("Type ARRAY must have a single element in the subschema");
}
ColumnType elemColType = schema.getColumnType(0);
Schema elemType;
if (elemColType.getColumnSchema() == null)
{
elemType = Schema.create(convertToAvroType(elemColType.getType()));
}
else
{
elemType =
convertFromBlockSchema(elemColType.getName() + (arrayElemInSchemaCounter++),
convertToAvroType(elemColType.getType()),
elemColType.getColumnSchema(), false);
}
avroSchema = Schema.createArray(elemType);
unionSchema = new ArrayList<Schema>();
unionSchema.add(Schema.create(Type.NULL));
unionSchema.add(avroSchema);
avroSchema = Schema.createUnion(unionSchema);
break;
}
        case MAP:
        {
            // an Avro MAP schema specifies only the value type (map keys are always strings)
            if (schema.getNumColumns() != 1)
            {
                throw new RuntimeException("Type MAP must have a single value column in the subschema");
            }
            ColumnType valueColType = schema.getColumnType(0);
Schema valueType;
if (valueColType.getColumnSchema() == null)
{
valueType = Schema.create(convertToAvroType(valueColType.getType()));
}
else
{
valueType =
convertFromBlockSchema(valueColType.getName(),
convertToAvroType(valueColType.getType()),
valueColType.getColumnSchema(), false);
}
avroSchema = Schema.createMap(valueType);
unionSchema = new ArrayList<Schema>();
unionSchema.add(Schema.create(Type.NULL));
unionSchema.add(avroSchema);
avroSchema = Schema.createUnion(unionSchema);
break;
}
default:
throw new IllegalArgumentException("Unsupported composite Type: " + type);
}
return avroSchema;
}
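    /**
     * Maps a Cubert DataType to the corresponding Avro Type: TUPLE maps to RECORD,
     * BAG to ARRAY, MAP to MAP, and the remaining (primitive) types are matched by
     * name.
     */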
private static Type convertToAvroType(DataType colType)
{
final Type subType;
if (colType == DataType.TUPLE)
{
/* Pig converts RECORD to TUPLE. Converting it back. */
subType = Type.RECORD;
}
else if (colType == DataType.BAG)
{
subType = Type.ARRAY;
}
else if (colType == DataType.MAP)
{
subType = Type.MAP;
}
else
{
subType = Type.valueOf(colType.toString().toUpperCase());
}
return subType;
}
    /**
     * Converts an Avro Schema to a Cubert BlockSchema.
     * <p>
     * The conversion goes through Pig: the Avro schema is first converted to a Pig
     * schema using Pig's own utilities, and the Pig schema is then converted to a
     * BlockSchema. This way, only the Pig <-> Cubert schema conversion path needs to
     * be maintained in this code.
     */
public static BlockSchema convertToBlockSchema(Schema avroSchema)
{
try
{
org.apache.pig.ResourceSchema pigResourceSchema =
AvroSchema2Pig.convert(avroSchema);
org.apache.pig.impl.logicalLayer.schema.Schema pigSchema =
org.apache.pig.impl.logicalLayer.schema.Schema.getPigSchema(pigResourceSchema);
return SchemaUtils.convertToBlockSchema(pigSchema);
}
catch (IOException e)
{
throw new RuntimeException(e);
}
}
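    /**
     * Creates an empty Avro file with the given schema at the given path, unless a
     * file already exists there. The file is created with rwxr-xr-x permissions.
     *
     * @param fileSchema the BlockSchema describing the records of the file
     * @param path the location of the file to create
     * @throws IOException if the file system cannot be accessed or the file cannot
     *             be created
     */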
public static void createFileIfNotExists(BlockSchema fileSchema, String path) throws IOException
{
        Configuration conf = new JobConf();
        Path outPath = new Path(path);
        // use the file system that owns the path, rather than the default file system
        FileSystem fs = outPath.getFileSystem(conf);
        if (fs.exists(outPath))
            return;
Schema avroSchema = convertFromBlockSchema("CUBERT_MV_RECORD", fileSchema);
System.out.println("Creating avro file with schema = " + avroSchema);
GenericDatumWriter<GenericRecord> datumWriter =
new GenericDatumWriter<GenericRecord>(avroSchema);
        DataFileWriter<GenericRecord> writer =
                new DataFileWriter<GenericRecord>(datumWriter);
        // create the file with rwxr-xr-x permissions
        FSDataOutputStream fout =
                FileSystem.create(fs,
                                  new Path(path),
                                  new FsPermission(FsAction.ALL,
                                                   FsAction.READ_EXECUTE,
                                                   FsAction.READ_EXECUTE));
        try
        {
            writer.create(avroSchema, fout);
            writer.flush();
        }
        finally
        {
            // closing the writer also closes the underlying output stream
            writer.close();
        }
}
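    /**
     * Command-line entry point: pretty-prints the schema of the Avro file (or of the
     * first *.avro file under the directory) given as the single argument.
     */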
    public static void main(String[] args) throws IOException
    {
        if (args.length != 1)
        {
            System.err.println("Usage: AvroUtils <path to an Avro file or directory>");
            System.exit(1);
        }
        JobConf conf = new JobConf();
        System.out.println(AvroUtils.getSchema(conf, new Path(args[0])).toString(true));
    }
}